Making date.factor variable :
data.ww2$date.factor <- 0
i=0
for ( i in 1:10586){
if (data.ww2[i,2] <= 1949 ) {
data.ww2[i,8] <- '1940s'}
else if (data.ww2[i,2] <= 1959){
data.ww2[i,8] <- '1950s'
}
else if (data.ww2[i,2] <= 1969){
data.ww2[i,8] <- '1960s'
}
else if (data.ww2[i,2] <= 1979){
data.ww2[i,8] <- '1970s'
}
else if (data.ww2[i,2] <= 1989){
data.ww2[i,8] <- '1980s'
}
else if (data.ww2[i,2] <= 1999){
data.ww2[i,8] <- '1990s'
}
else if (data.ww2[i,2] <= 2009){
data.ww2[i,8] <- '2000s'
}
else{ data.ww2[i,8] <- '2010s'
}
}
data.ww2$date<-as.factor(data.ww2$date)
data.ww2$date.factor<-as.factor(data.ww2$date.factor)
str(data.ww2)
## 'data.frame': 10586 obs. of 8 variables:
## $ key : chr "Afghanistan - 1940" "Albania - 1940" "Algeria - 1940" "Angola - 1940" ...
## $ date : Factor w/ 79 levels "1940","1941",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ country : Factor w/ 193 levels "Afghanistan",..: 1 2 3 5 7 9 10 13 14 17 ...
## $ continent : Factor w/ 5 levels "Africa","Americas",..: 3 4 1 1 2 5 4 3 3 4 ...
## $ gdp.number : int 976 1820 3500 2330 7290 10100 7530 5570 1180 7090 ...
## $ life.expectancy.number: num 31.4 42.2 37.1 33 58.4 66.5 57.8 34.4 33.5 56 ...
## $ population.number : int 7030000 1120000 7800000 4090000 14200000 7050000 6700000 105000 37400000 8340000 ...
## $ date.factor : Factor w/ 8 levels "1940s","1950s",..: 1 1 1 1 1 1 1 1 1 1 ...
Seeing plots by ggpairs:
ggpairs(data.ww2[,c(5:6,8)])
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Seeing plots by plotly:
#plot_ly(data.ww2, x= ~log(gdp.number) ,y=~life.expectancy.number , z=~date, type= 'scatter3d' , marker= list(size=1.5))
data.ww2.lm = lm(life.expectancy.number ~ log10(gdp.number) + date, data = data.ww2 )
data.ww2.lm.df = augment(data.ww2.lm)
colnames(data.ww2.lm.df) = c("life.expectancy.number" , "gdp.number" , "date" , ".fitted" , ".se.fit" , ".resid" , ".hat" , ".sigma" ,".cooksd" , ".std.resid" )
ggplot(data.ww2.lm.df, aes(x = gdp.number, y = .resid)) +
geom_smooth(se=F) +geom_point(alpha=0.1) +
ggtitle('Residual Plot for Linear Model ') +
labs(x='log10 of gdp')
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data.ww2.lm.df, aes(x = .fitted, y = abs(.resid))) +
geom_smooth(se=F) +geom_point(alpha=0.1)+
ggtitle('Residual & Fitted Plot for Linear Model ') +
labs(x='log10 of gdp')
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
-not good residual plot
Make predictions:
ww2.lm.grid <-expand.grid(gdp.number = seq(5.5, 11.6, 0.1), date = seq(1940,2018,1))
ww2.lm.grid$date<-as.factor(ww2.lm.grid$date)
ww2.lm.grid.predict = predict(data.ww2.lm, newdata = ww2.lm.grid)
ww2.lm.grid.predict = data.frame(ww2.lm.grid, life.expectancy = as.vector(ww2.lm.grid.predict))
#ggplot(ww2.lm.grid.predict, aes(x = gdp.number, y = life.expectancy)) + geom_line() +facet_wrap(~date)
ggplot(ww2.lm.grid.predict, aes(x = gdp.number, y = life.expectancy, group = date, color = date)) + geom_line() +
ggtitle('Predicted Life Expectancy per GDP after WW2' ) +
labs(x= 'Log 10 of GDP per capita' , y= 'Life Expectancy', subtitle = 'Using linear model with seperate dates')
-looks different in every date / date has effect!
data.ww2.lm.f = lm(life.expectancy.number ~ log10(gdp.number) + date.factor, data = data.ww2)
data.ww2.lm.f.df = augment(data.ww2.lm.f)
colnames(data.ww2.lm.f.df) = c("life.expectancy.number" , "gdp.number" , "date" , ".fitted" , ".se.fit" , ".resid" , ".hat" , ".sigma" ,".cooksd" , ".std.resid" )
ggplot(data.ww2.lm.f.df, aes(x = gdp.number, y = .resid)) +
geom_smooth(se=F) +geom_point(alpha=0.1) +
ggtitle('Residual Plot for Linear Model ') +
labs(x='log10 of gdp')
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(data.ww2.lm.df, aes(x = .fitted, y = abs(.resid))) +
geom_smooth(se=F) +geom_point(alpha=0.1)+
ggtitle('Residual & Fitted Plot for Linear Model ') +
labs(x='log10 of gdp')
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Make predictions:
ww2.lm.f.grid <-expand.grid(gdp.number = seq(5.5, 11.6, 0.1), date.factor = c('1940s','1950s','1960s','1970s','1980s','1990s','2010s'))
ww2.lm.f.grid$date.factor<-as.factor(ww2.lm.f.grid$date.factor)
ww2.lm.f.grid.predict = predict(data.ww2.lm.f, newdata = ww2.lm.f.grid)
ww2.lm.f.grid.predict = data.frame(ww2.lm.f.grid, life.expectancy = as.vector(ww2.lm.f.grid.predict))
#ggplot(ww2.lm.f.grid.predict, aes(x = gdp.number, y = life.expectancy)) + geom_line() +facet_wrap(~date.factor)
ggplot(ww2.lm.f.grid.predict, aes(x = gdp.number, y = life.expectancy, group = date.factor, color = date.factor)) + geom_line()+
ggtitle('Predicted Life Expectancy per GDP after WW2' ) +
labs(x= 'Log 10 of GDP per capita' , y= 'Life Expectancy', subtitle = 'Using linear model with dates together')
Check the R-squared value for each model:
summary(data.ww2.lm) #Adjusted R-squared: 0.7931
##
## Call:
## lm(formula = life.expectancy.number ~ log10(gdp.number) + date,
## data = data.ww2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -35.953 -3.498 0.250 4.098 19.428
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -13.37760 0.66294 -20.179 < 2e-16 ***
## log10(gdp.number) 16.35201 0.11934 137.016 < 2e-16 ***
## date1941 -0.26325 0.74836 -0.352 0.72501
## date1942 -0.59908 0.74836 -0.801 0.42343
## date1943 -0.75554 0.74836 -1.010 0.31271
## date1944 -1.17385 0.74836 -1.569 0.11678
## date1945 0.04849 0.74836 0.065 0.94834
## date1946 2.83863 0.74836 3.793 0.00015 ***
## date1947 4.08149 0.74836 5.454 5.04e-08 ***
## date1948 5.28258 0.74837 7.059 1.79e-12 ***
## date1949 6.16455 0.74838 8.237 < 2e-16 ***
## date1950 7.24860 0.74839 9.686 < 2e-16 ***
## date1951 7.30784 0.74840 9.765 < 2e-16 ***
## date1952 7.85601 0.74841 10.497 < 2e-16 ***
## date1953 8.19800 0.74844 10.954 < 2e-16 ***
## date1954 8.59137 0.74846 11.479 < 2e-16 ***
## date1955 8.95452 0.74848 11.964 < 2e-16 ***
## date1956 9.24828 0.74851 12.356 < 2e-16 ***
## date1957 9.51004 0.74854 12.705 < 2e-16 ***
## date1958 10.01966 0.74856 13.385 < 2e-16 ***
## date1959 10.24416 0.74860 13.684 < 2e-16 ***
## date1960 10.50180 0.74865 14.028 < 2e-16 ***
## date1961 10.86701 0.74868 14.515 < 2e-16 ***
## date1962 11.14669 0.74873 14.887 < 2e-16 ***
## date1963 11.46295 0.74878 15.309 < 2e-16 ***
## date1964 11.70125 0.74885 15.626 < 2e-16 ***
## date1965 11.87362 0.74891 15.854 < 2e-16 ***
## date1966 12.11740 0.74897 16.179 < 2e-16 ***
## date1967 12.40403 0.74902 16.560 < 2e-16 ***
## date1968 12.53720 0.74909 16.737 < 2e-16 ***
## date1969 12.63498 0.74918 16.865 < 2e-16 ***
## date1970 12.94943 0.74927 17.283 < 2e-16 ***
## date1971 13.17221 0.74935 17.578 < 2e-16 ***
## date1972 13.16700 0.74942 17.570 < 2e-16 ***
## date1973 13.59164 0.74949 18.135 < 2e-16 ***
## date1974 13.68842 0.74960 18.261 < 2e-16 ***
## date1975 13.96253 0.74959 18.627 < 2e-16 ***
## date1976 13.91502 0.74967 18.561 < 2e-16 ***
## date1977 14.34394 0.74973 19.132 < 2e-16 ***
## date1978 14.51168 0.74979 19.354 < 2e-16 ***
## date1979 14.65310 0.74985 19.541 < 2e-16 ***
## date1980 15.16758 0.74988 20.227 < 2e-16 ***
## date1981 15.37067 0.74987 20.498 < 2e-16 ***
## date1982 15.64724 0.74984 20.867 < 2e-16 ***
## date1983 15.98470 0.74982 21.318 < 2e-16 ***
## date1984 16.19476 0.74985 21.597 < 2e-16 ***
## date1985 16.45600 0.74985 21.946 < 2e-16 ***
## date1986 16.79364 0.74986 22.396 < 2e-16 ***
## date1987 16.87506 0.74989 22.503 < 2e-16 ***
## date1988 17.06481 0.74991 22.756 < 2e-16 ***
## date1989 17.50013 0.74991 23.336 < 2e-16 ***
## date1990 17.64148 0.74990 23.525 < 2e-16 ***
## date1991 17.88262 0.74984 23.849 < 2e-16 ***
## date1992 18.07015 0.74983 24.099 < 2e-16 ***
## date1993 18.13016 0.74981 24.180 < 2e-16 ***
## date1994 17.91198 0.74983 23.888 < 2e-16 ***
## date1995 17.99906 0.74993 24.001 < 2e-16 ***
## date1996 17.83637 0.75006 23.780 < 2e-16 ***
## date1997 17.67691 0.75019 23.563 < 2e-16 ***
## date1998 17.62551 0.75025 23.493 < 2e-16 ***
## date1999 17.70647 0.75030 23.599 < 2e-16 ***
## date2000 17.85390 0.75038 23.793 < 2e-16 ***
## date2001 17.99869 0.75044 23.984 < 2e-16 ***
## date2002 18.12362 0.75049 24.149 < 2e-16 ***
## date2003 18.23038 0.75056 24.289 < 2e-16 ***
## date2004 18.20683 0.75073 24.252 < 2e-16 ***
## date2005 18.35342 0.75087 24.443 < 2e-16 ***
## date2006 18.43789 0.75104 24.550 < 2e-16 ***
## date2007 18.55279 0.75121 24.697 < 2e-16 ***
## date2008 18.75883 0.75131 24.968 < 2e-16 ***
## date2009 19.24440 0.75126 25.616 < 2e-16 ***
## date2010 19.24323 0.75139 25.610 < 2e-16 ***
## date2011 19.62227 0.75146 26.112 < 2e-16 ***
## date2012 19.92723 0.75157 26.514 < 2e-16 ***
## date2013 20.21041 0.75164 26.888 < 2e-16 ***
## date2014 20.36586 0.75171 27.093 < 2e-16 ***
## date2015 20.67405 0.75177 27.501 < 2e-16 ***
## date2016 20.88265 0.75183 27.776 < 2e-16 ***
## date2017 20.99681 0.75194 27.924 < 2e-16 ***
## date2018 21.10238 0.75204 28.060 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.126 on 10506 degrees of freedom
## Multiple R-squared: 0.7947, Adjusted R-squared: 0.7931
## F-statistic: 514.7 on 79 and 10506 DF, p-value: < 2.2e-16
summary(data.ww2.lm.f) #Adjusted R-squared: 0.7877
##
## Call:
## lm(formula = life.expectancy.number ~ log10(gdp.number) + date.factor,
## data = data.ww2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -35.817 -3.532 0.253 4.152 18.505
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -12.0418 0.4393 -27.41 <2e-16 ***
## log10(gdp.number) 16.4195 0.1207 136.02 <2e-16 ***
## date.factor1950s 7.1489 0.2400 29.79 <2e-16 ***
## date.factor1960s 10.1473 0.2412 42.07 <2e-16 ***
## date.factor1970s 12.2098 0.2433 50.18 <2e-16 ***
## date.factor1980s 14.7169 0.2442 60.26 <2e-16 ***
## date.factor1990s 16.2585 0.2446 66.47 <2e-16 ***
## date.factor2000s 16.7801 0.2471 67.90 <2e-16 ***
## date.factor2010s 18.7344 0.2561 73.15 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.205 on 10577 degrees of freedom
## Multiple R-squared: 0.7879, Adjusted R-squared: 0.7877
## F-statistic: 4911 on 8 and 10577 DF, p-value: < 2.2e-16
Write a function to fit a LM for each group:
continent.lm = function(data){
lm(life.expectancy.number ~ log10(gdp.number) + date, data = data)
}
Make a nested data frame:
continent.lm.m = data.ww2 %>%
group_by(continent) %>%
nest()
continent.lm.m.m = map(continent.lm.m$data, continent.lm)
continent.lm.m = mutate(continent.lm.m, model = continent.lm.m.m)
continent.lm.m = mutate(continent.lm.m, .fitted = map2(data, model, add_predictions))
continent.lm.m = mutate(continent.lm.m, .resid = map2(data, model, add_residuals))
continent.lm.m.fitted = unnest(continent.lm.m, .fitted,.resid)
continent.lm.m.fitted<-continent.lm.m.fitted[,-c(10:16)]
ploting the residuals :
ggplot(continent.lm.m.fitted , aes(x=(gdp.number),y= resid))+
geom_point()+ geom_smooth(method='loess',se=F)+
scale_x_log10() +
ggtitle("Residual Plot for Gdp per Capita and Life Expectancy")+
labs(x='log 10 of gdp per capita')
ggplot(continent.lm.m.fitted , aes(x=(gdp.number),y= abs(resid))) +
geom_point()+ geom_smooth(method='loess',se=F)+
scale_x_log10() +
ggtitle("Absolute Residual Plot for Gdp per Capita and Life Expectancy")+
labs(x='log 10 of gdp per capita')
Make model visuable:
continent.lm.m.fitted.df<-as.data.frame(continent.lm.m.fitted)
ggplot(continent.lm.m.fitted.df, aes(x = gdp.number, y = pred, group=date , color= date)) +
geom_line()+scale_x_log10() +
facet_grid(~continent)+
ggtitle('Predicted Life Expectancy per GDP after WW2' ) +
labs(x= 'Log 10 GDP per capita' , y= 'Life Expectancy', subtitle = 'Using linear model faceting by continents')
Write a function to fit a LM for each group:
continent.lm.f = function(data){
lm(life.expectancy.number ~ log10(gdp.number) + date.factor, data = data)
}
Make a nested data frame:
continent.lm.m.f = data.ww2 %>%
group_by(continent) %>%
nest()
continent.lm.m.f.m = map(continent.lm.m.f$data, continent.lm.f)
continent.lm.m.f = mutate(continent.lm.m.f, model = continent.lm.m.f.m)
continent.lm.m.f = mutate(continent.lm.m.f, .fitted = map2(data, model, add_predictions))
continent.lm.m.f = mutate(continent.lm.m.f, .resid = map2(data, model, add_residuals))
continent.lm.m.f.fitted = unnest(continent.lm.m.f, .fitted,.resid)
continent.lm.m.f.fitted<-continent.lm.m.f.fitted[,-c(10:16)]
plotting the residuals :
ggplot(continent.lm.m.f.fitted , aes(x=(gdp.number),y= resid))+
geom_point()+ geom_smooth(method='loess',se=F)+
scale_x_log10() +
ggtitle("Residual Plot for Gdp per Capita and Life Expectancy")+
labs(x='log 10 of gdp per capita')
ggplot(continent.lm.m.f.fitted , aes(x=(gdp.number),y= abs(resid))) +
geom_point()+ geom_smooth(method='loess',se=F)+
scale_x_log10() +
ggtitle("Absolute Residual Plot for Gdp per Capita and Life Expectancy")+
labs(x='log 10 of gdp per capita')
Visual graph:
ggplot(continent.lm.m.f.fitted, aes(x = gdp.number, y = pred, group = date.factor, color = date.factor)) +
geom_line()+scale_x_log10()+
facet_grid(~continent)+
ggtitle('Predicted Life Expectancy per GDP after WW2' ) +
labs(x= 'Log 10 of GDP per capita' , y= 'Life Expectancy', subtitle = 'Using linear model considering continents and seperate dates', caption = 'Faceted by Continent')
ggplot(continent.lm.m.f.fitted, aes(x = gdp.number, y = pred, group = continent, color = continent)) +
geom_line()+scale_x_log10()+
facet_grid(~date.factor)+
ggtitle('Predicted Life Expectancy per GDP after WW2' ) +
labs(x= 'Log 10 of GDP per capita' , y= 'Life Expectancy', subtitle = 'Using linear model considering continents with dates together', caption = 'Faceted by Date')
-As the date increases the GDP as well increase! -The GDP differs among continents too -And the life expectancy increases as well as the gdp increases